*Input Data
use "$root/Data/Original/EV_namelist_new.dta", replace

replace strasse = lower(strasse)

* get rid of spaces between hausnr and hausnr letter for some observations
forvalues num = 0(1)9 {
	foreach buch in a b c d e f g h i j k l m n o p q r s t u v w x y z {
		qui replace strasse=subinstr(strasse, "`num' `buch'","`num'`buch'",.)
	}
}

*identify adresses without hausnr
gen has_HausNr = 0
forvalues num=1(1)9 {
	replace has_HausNr = 1 if strpos(strasse, "`num'")
}

* separate strasse and hausnr
* step (1): split strasse into parts
* step (2): find number of non-empty variable for each HH
* step (3): HausNr is the last non empty variable if household has_HausNr==1,
*           all other variables are combined to get address
*			if HH has_HausNr==0 if just put together the whole adress

**There are a couple observations that have commas and points in them, we replace them with spaces
replace strasse=subinstr(strasse, ",", " ",.)

*step 1
split strasse, p(" ")
drop strasse

*Here manual data-processing steps to clean up observations that had data errors were performed. For example some additional spaces are deleted, some misspellings are corrected, addresses with postal holding are corrected (if possible). For data security reasons,
*these steps are suppressed in the replication file, as otherwise the anonymity is no longer guaranteed.


*step 2
gen num_strass_vars = .  // last strasse? variable contains Hausrn
forvalues ii=1(1)8{
	replace num_strass_vars=`ii' if strasse`ii'!=""
}

gen Strasse=strasse1
gen HausNr=""
forvalues ii=2(1)8{
	replace HausNr = strasse`ii' if num_strass_vars==`ii' & has_HausNr==1
	replace Strasse=Strasse+" "+strasse`ii' if num_strass_vars>`ii'
	replace Strasse=Strasse+" "+strasse`ii' if num_strass_vars==`ii' & has_HausNr==0
}
drop num_strass_vars strasse*

destring HausNr, gen(nr) force

gen has_letter = 0
foreach buch in a b c d e f g h i j k l m n o p q r s t u v w x y z {
	replace has_letter = 1 if strpos(HausNr, "`buch'")
}

*some observations have two numbers separated by a slash (we only consider first number)
split HausNr if nr==. & has_letter==0, p("/") 

replace HausNr=HausNr1 if nr==. & has_letter==0 & HausNr1!=""
drop HausNr1 HausNr2 nr has_letter

destring HausNr, gen(nr) force

gen has_letter = 0
foreach buch in a b c d e f g h i j k l m n o p q r s t u v w x y z {
	replace has_letter = 1 if strpos(HausNr, "`buch'")
}


*some observations have two numbers, separated by a -
split HausNr if nr==. & has_letter==0, p("-") 

replace HausNr=HausNr1 if nr==. & has_letter==0 & HausNr1!=""
drop HausNr1 HausNr2 nr has_letter

destring HausNr, gen(nr) force

gen has_letter = 0
foreach buch in a b c d e f g h i j k l m n o p q r s t u v w x y z {
	replace has_letter = 1 if strpos(HausNr, "`buch'")
}
*some observations have two numbers, separated by a +
split HausNr if nr==. & has_letter==0, p("+") 

replace HausNr=HausNr1 if nr==. & has_letter==0 & HausNr1!=""
drop HausNr1 HausNr2 nr has_letter

destring HausNr, gen(nr) force

gen has_letter = 0
foreach buch in a b c d e f g h i j k l m n o p q r s t u v w x y z {
	replace has_letter = 1 if strpos(HausNr, "`buch'")
}

*The rest of the observations were previously not properly separated
split Strasse if nr==. & has_letter==0, p(.)

destring Strasse2, g(nr2) force

replace Strasse=Strasse1+"asse" if nr==. & has_letter==0 & nr2!=.
replace HausNr=Strasse2 if nr==. & has_letter==0 & nr2!=.

drop Strasse1 Strasse2 Strasse3 Strasse4 nr has_letter nr2

destring HausNr, gen(nr) force

gen has_letter = 0
foreach buch in a b c d e f g h i j k l m n o p q r s t u v w x y z {
	replace has_letter = 1 if strpos(HausNr, "`buch'")
}

*insert a comma between number and street so that we can process the rest of tbe observations
forvalues num = 0(1)9 {
	foreach buch in a b c d e f g h i j k l m n o p q r s t u v w x y z {
		qui replace Strasse=subinstr(Strasse, "`buch'`num'","`buch',`num'",.) if nr==. & has_letter==0
	}
}

split Strasse if nr==. & has_letter==0, p(,)
replace Strasse=Strasse1 if nr==. & has_letter==0 & Strasse2!=""
replace HausNr=Strasse2 if nr==. & has_letter==0 & Strasse2!=""

drop Strasse1 Strasse2 nr has_letter 

destring HausNr, gen(nr) force

gen has_letter = 0
foreach buch in a b c d e f g h i j k l m n o p q r s t u v w x y z {
	replace has_letter = 1 if strpos(HausNr, "`buch'")
}

**Additional manual corrections were performed for some, that for anonymity reasons are suppressed here. The goal was to have
* each address standardised into one street name and one number. 

destring HausNr, gen(nr) force

gen has_letter = 0
foreach buch in a b c d e f g h i j k l m n o p q r s t u v w x y z {
	replace has_letter = 1 if strpos(HausNr, "`buch'")
}



rename hauptname Nachname
rename stammnummer VID
rename vorname Vorname
rename plz PLZ
rename ort Ort

keep year VID Vorname Nachname  Strasse HausNr PLZ Ort
save "$root/Data/Original/EV_namelist_clean_new.dta", replace



***********************************************************************
*** PREPARE NAMELIST to be ready for merge
***********************************************************************
use "$root/Data/Original/EV_namelist_clean_new.dta", clear
drop Ort

rename VID ID
do "$root/Code/Preparation/Management/Merge_Prepare_Names_01.do"
do "$root/Code/Preparation/Management/Merge_Prepare_Adress_01.do"
rename ID VID

* In EV list it seems to be current adress. Merge with EV list
drop year Adress_withletter
duplicates drop VID Name Adress, force

bys Name Adress: gen temp  = _n
bys Name Adress: gen temp2 = _N
drop if temp2 > 5
reshape wide VID, i(Name Adress) j(temp)
drop temp

save "$root/Data/Original/EV_names_merge_new.dta", replace


